import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import plotly
import cufflinks as cf
import pandoc
plotly.offline.init_notebook_mode()
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
import os
os.chdir('/Users/cbuie/PycharmProjects/sf16_ds4/challenges/02-pandas')
# Load the 2013 movie data set and print a summary of its columns and dtypes.
data = pd.read_csv('2013_Movies.csv')
data.info()

# Parse the release dates into datetime64 values so they can be plotted on a
# time axis and binned by month. `infer_datetime_format` is deprecated in
# pandas 2.0 (format inference is the default there), so it is not passed.
data['ReleaseDate'] = pd.to_datetime(data['ReleaseDate'])
Plot domestic total gross over time.
# Scatter of domestic total gross against release date. Two alternative
# renderings are kept below for reference; only the plotly one is executed.

## 1st way: seaborn
# sns.swarmplot(x="ReleaseDate", y="DomesticTotalGross", data=data[['ReleaseDate','DomesticTotalGross']], size=10)

## 2nd way: pyplot
# x = np.array(data['ReleaseDate'])
# y = np.array(data['DomesticTotalGross'])
# plt.pyplot.figure(figsize=(12,6))
# plt.pyplot.scatter(x,y,alpha=0.8,marker='o')

## 3rd way: plotly
fig = {
    'data': [
        {
            'x': data['ReleaseDate'],
            'y': data['DomesticTotalGross'],
            'mode': 'markers',
            'name': '2013',
        }
    ],
    'layout': {
        # The layout title is given as the title text itself; the original
        # nested it as {'title': ...}, which is not a valid title spec.
        'title': 'DTG vs Release Date',
        'xaxis': {'title': 'Date'},
        'yaxis': {'title': "Domestic Total Gross"},
    },
}
plotly.offline.iplot(fig)
Plot runtime vs. domestic total gross.
# Scatter of runtime (y) against domestic total gross (x); the x axis is
# logarithmic.
fig = {
    'data': [
        {
            'x': data['DomesticTotalGross'],
            'y': data['Runtime'],
            'mode': 'markers',
            'name': '2013',
        }
    ],
    'layout': {
        # The layout title is given as the title text itself; the original
        # nested it as {'title': ...}, which is not a valid title spec.
        'title': 'Runtime vs. DTG (m)',
        'xaxis': {'title': 'DomesticTotalGross (log scale)', 'type': 'log'},
        'yaxis': {'title': "Runtime"},
    },
}
plotly.offline.iplot(fig, filename='pandas/multiple-scatter')
Exercise 2.3
Group your data by Rating and find the average runtime and domestic total gross at each level of Rating.
# Average runtime and average domestic total gross for each rating.
# Several columns are selected from a groupby with a list (double brackets);
# the bare-tuple form ['Runtime','DomesticTotalGross'] is rejected by
# pandas >= 1.0.
by_rating_df = data.groupby('Rating')[['Runtime', 'DomesticTotalGross']].mean()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(by_rating_df)
Make one figure with N subplots (N = the number of MPAA ratings), and in each subplot plot the release date vs. the domestic total gross for that rating.
# Mean domestic total gross per (rating, release date), as a flat frame with
# 'Rating', 'ReleaseDate' and 'DomesticTotalGross' columns.
by_rating_df2 = (
    data.groupby(['Rating', 'ReleaseDate'])['DomesticTotalGross']
    .mean()
    .reset_index()
)

# The original also called by_rating_df2.set_index('ReleaseDate') here and
# discarded the result. That statement had no effect, and 'ReleaseDate' has to
# remain a column for the pivot below, so it is removed.

# Express the gross in millions.
by_rating_df2['DomesticTotalGross'] = by_rating_df2['DomesticTotalGross'] / 1000000

# One column per rating, indexed by release date, so that each rating is drawn
# as its own subplot.
by_rating_df2 = by_rating_df2.pivot(
    index='ReleaseDate', columns='Rating', values='DomesticTotalGross'
)

# Marker size is passed as a number rather than the string '6'.
by_rating_df2.iplot(
    subplots=True,
    online=False,
    subplot_titles=True,
    kind='scatter',
    mode='markers',
    size=6,
    title='Domestic Total Gross vs. Release Date',
    theme='white',
)
What director in your dataset has the highest gross per movie?
Alfonso Cuaron has the highest Average DTG: 274092705 (*only one movie)
# Per-director mean domestic total gross ('mean') and number of movies ('len').
by_Director_df = data.groupby(['Director'])['DomesticTotalGross'].agg([np.mean, len]).reset_index()

# Rank ALL directors by mean gross first, then keep the top ten. The original
# took head(10) before sorting, which only ranked the first ten directors in
# group (alphabetical) order.
print(by_Director_df.sort_values('mean', ascending=False).head(10))
Bin your dataset into months and make a bar graph of the mean domestic total gross by month. Error bars will represent the standard error of the mean.
Title of graph should include: Mean Domestic Total Gross by Month in 2013
Topic for consideration: what is the correct formula for the standard error of the mean? Examine the error bars and see if they are "reasonable."
The standard error of the mean (SE of the mean) estimates the variability between sample means that you would obtain if you took multiple samples from the same population. The standard error of the mean estimates the variability between samples, whereas the standard deviation measures the variability within a single sample.
The standard error of the mean is calculated by dividing the sample standard deviation by the square root of the sample size: $SE = s / \sqrt{n}$.
Use the standard error of the mean to determine how precisely the mean of the sample estimates the population mean. Lower values of the standard error of the mean indicate more precise estimates of the population mean. Usually, a larger standard deviation will result in a larger standard error of the mean and a less precise estimate. A larger sample size will result in a smaller standard error of the mean and a more precise estimate.
For example, you have a mean delivery time of 3.80 days with a standard deviation of 1.43 days based on a random sample of 312 delivery times. These numbers yield a standard error of the mean of 0.08 days (1.43 divided by the square root of 312). Had you taken multiple random samples of the same size from the same population, the standard deviation of those different sample means would be around 0.08 days.
from scipy import stats
# Tag every movie with the month number of its release date.
data['month'] = pd.DatetimeIndex(data.ReleaseDate).month

# Per-month statistics of domestic total gross. The resulting columns are
# named after the aggregators: 'mean', 'std', 'len' and 'sem'.
by_month_df = (
    data
    .groupby('month')['DomesticTotalGross']
    .agg([np.mean, np.std, len, stats.sem])
)
by_month_df.info()
# NOTE(review): `py` is not used in this block, and plotly.plotly was moved to
# the separate chart_studio package in plotly 4 - confirm whether this import
# can be dropped.
import plotly.plotly as py
import plotly.graph_objs as go

# Bar chart of the mean domestic total gross per release month, with the
# standard error of the mean drawn as error bars.

# Build the labels from the months actually present in by_month_df (its index
# holds the month numbers) so each bar keeps the right label even when a
# month has no releases; a fixed 12-item list would shift the labels.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
x = [month_labels[int(month) - 1] for month in by_month_df.index]

# The axis title and trace name say "millions", so scale the dollar values
# (and their standard errors) accordingly.
y = by_month_df['mean'] / 1000000.0
e = by_month_df['sem'] / 1000000.0

trace1 = go.Bar(
    x=x,
    y=y,
    error_y=dict(
        type='data',
        array=e,
    ),
    name='Mean DTG (m)',
    marker=dict(
        color='rgba(50, 171, 96, 0.6)',
        line=dict(
            color='rgba(50, 171, 96, 1.0)',
            width=2,
        ),
    ),
)

layout = go.Layout(
    title='Mean Domestic Total Gross by Month in 2013',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)',
        ),
    ),
    yaxis=dict(
        title='Domestic Total Gross (millions)',
        titlefont=dict(
            size=12,
            color='rgb(107, 107, 107)',
        ),
        tickfont=dict(
            size=12,
            color='rgb(107, 107, 107)',
        ),
    ),
)

# Pass the trace list directly instead of rebinding `data`, which holds the
# movies DataFrame; overwriting it breaks any later use or re-run of the
# DataFrame-based code.
fig = go.Figure(data=[trace1], layout=layout)
plotly.offline.iplot(fig)
# import plotly.plotly as py
# from plotly.graph_objs import *
# import plotly.graph_objs as go
# import pandas as pd
# by_month_df
# months = by_month_df.index
# DGP = by_month_df['mean']
# stderr = by_month_df['sem']
# data = [
# go.Bar(
# x = months,
# y = DGP,
# error_y=dict(
# type='data',
# array=stderr
# ),
# )
# ]
# layout = go.Layout( xaxis=XAxis(type='category') )
# fig = Figure( data=data, layout=layout)
# plotly.offline.iplot(fig)